{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature: Phrase Embedding Distances"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on the pre-trained word embeddings, we'll calculate the mean embedding vector of each question (as well as the unit-length normalized sum of word embeddings), and compute vector distances between these aggregate vectors."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models.wrappers.fasttext import FastText\n",
    "from gensim.models import KeyedVectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.spatial.distance import cosine, euclidean, cityblock"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identifier for storing these features on disk and referring to them later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_list_id = 'phrase_embedding'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Preprocessed and tokenized questions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')\n",
    "tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens = tokens_train + tokens_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pretrained word vector database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#embedding_model = FastText.load_word2vec_format(project.aux_dir + 'fasttext_vocab.vec') ### out-of-dated calling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#embedding_model = KeyedVectors.load_word2vec_format(project.aux_dir + \"GoogleNews-vectors-negative300.bin\", binary=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#embedding_model = KeyedVectors.load_word2vec_format(project.aux_dir + 'wiki-news-300d-1M-subword.vec')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_model = KeyedVectors.load_word2vec_format(project.aux_dir + 'fasttext_vocab.vec')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_phrase_embedding_distances(pair):\n",
    "    q1_vectors = [embedding_model[token] for token in pair[0] if token in embedding_model]\n",
    "    q2_vectors = [embedding_model[token] for token in pair[1] if token in embedding_model]\n",
    "\n",
    "    if len(q1_vectors) == 0:\n",
    "        q1_vectors.append(np.zeros(word_vector_dim))\n",
    "    if len(q2_vectors) == 0:\n",
    "        q2_vectors.append(np.zeros(word_vector_dim))\n",
    "        \n",
    "    q1_mean = np.mean(q1_vectors, axis=0)\n",
    "    q2_mean = np.mean(q2_vectors, axis=0)\n",
    "    \n",
    "    q1_sum = np.sum(q1_vectors, axis=0)\n",
    "    q2_sum = np.sum(q2_vectors, axis=0)\n",
    "\n",
    "    q1_norm = q1_sum / np.sqrt((q1_sum ** 2).sum())\n",
    "    q2_norm = q2_sum / np.sqrt((q2_sum ** 2).sum())\n",
    "    \n",
    "    return [\n",
    "        cosine(q1_mean, q2_mean),\n",
    "        np.log(cityblock(q1_mean, q2_mean) + 1),\n",
    "        euclidean(q1_mean, q2_mean),\n",
    "        \n",
    "        cosine(q1_norm, q2_norm),\n",
    "        np.log(cityblock(q1_norm, q2_norm) + 1),\n",
    "        euclidean(q1_norm, q2_norm),        \n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches:  38%|███▊      | 1045/2751 [1:27:01<2:22:04,  5.00s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n",
      "  \"timeout or by a memory leak.\", UserWarning\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches:  45%|████▌     | 1249/2751 [1:43:35<2:04:33,  4.98s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n",
      "  \"timeout or by a memory leak.\", UserWarning\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 2751/2751 [4:06:38<00:00,  5.38s/it]  \n"
     ]
    }
   ],
   "source": [
    "distances = kg.jobs.map_batch_parallel(\n",
    "    tokens,\n",
    "    item_mapper=get_phrase_embedding_distances,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "distances = np.array(distances)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = distances[:len(tokens_train)]\n",
    "X_test = distances[len(tokens_train):]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train: (404290, 6)\n",
      "X_test:  (2345796, 6)\n"
     ]
    }
   ],
   "source": [
    "print('X_train:', X_train.shape)\n",
    "print('X_test: ', X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = [\n",
    "    'phrase_emb_mean_cosine',\n",
    "    'phrase_emb_mean_cityblock_log',\n",
    "    'phrase_emb_mean_euclidean',\n",
    "    \n",
    "    'phrase_emb_normsum_cosine',\n",
    "    'phrase_emb_normsum_cityblock_log',\n",
    "    'phrase_emb_normsum_euclidean',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.save_features(X_train, X_test, feature_names, feature_list_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
